{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python377jvsc74a57bd0086b6e2ac5931a4f7a51b812f0475290a145e7d51f3e22b53c52adf5c273fe30",
   "display_name": "Python 3.7.7 64-bit ('py37': conda)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import sklearn\n",
    "from sklearn import model_selection\n",
    "import tqdm\n",
    "from collections import Counter\n",
    "import random\n",
    "import numpy as np\n",
    "from typing import List, Dict\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Canonical profession labels (28 entries). Not referenced by the functions defined in this notebook.\n",
    "PROFS = [\n",
    "    \"professor\", \"physician\", \"attorney\", \"photographer\",\n",
    "    \"journalist\", \"nurse\", \"psychologist\", \"teacher\",\n",
    "    \"dentist\", \"surgeon\", \"architect\", \"painter\",\n",
    "    \"model\", \"poet\", \"filmmaker\", \"software_engineer\",\n",
    "    \"accountant\", \"composer\", \"dietitian\", \"comedian\",\n",
    "    \"chiropractor\", \"pastor\", \"paralegal\", \"yoga_teacher\",\n",
    "    \"dj\", \"interior_designer\", \"personal_trainer\", \"rapper\",\n",
    "]\n",
    "\n",
    "# a dictionary for uniting similar professions, according to De-Arteaga, Maria, et al. 2019\n",
    "# (lower-cased raw title -> unified profession label)\n",
    "PROF2UNIFIED_PROF = {\n",
    "    'associate professor': 'professor',\n",
    "    'assistant professor': 'professor',\n",
    "    'software engineer': 'software_engineer',\n",
    "    'psychotherapist': 'psychologist',\n",
    "    'orthopedic surgeon': 'surgeon',\n",
    "    'trial lawyer': 'attorney',\n",
    "    'plastic surgeon': 'surgeon',\n",
    "    'trial attorney': 'attorney',\n",
    "    'senior software engineer': 'software_engineer',\n",
    "    'interior designer': 'interior_designer',\n",
    "    'certified public accountant': 'accountant',\n",
    "    'cpa': 'accountant',\n",
    "    'neurosurgeon': 'surgeon',\n",
    "    'yoga teacher': 'yoga_teacher',\n",
    "    'nutritionist': 'dietitian',\n",
    "    'personal trainer': 'personal_trainer',\n",
    "    'certified personal trainer': 'personal_trainer',\n",
    "    'yoga instructor': 'yoga_teacher',\n",
    "}\n",
    "\n",
    "\n",
    "def load_data(fname):\n",
    "    \"\"\"\n",
    "    Load the pickled BIOS dataset from De-Arteaga, Maria, et al. 2019\n",
    "\n",
    "    :param fname: path of the pickle file to read\n",
    "    :return: the unpickled object\n",
    "    \"\"\"\n",
    "    # NOTE: unpickling can execute arbitrary code stored in the file; only open trusted files.\n",
    "    with open(fname, \"rb\") as pickle_file:\n",
    "        return pickle.load(pickle_file)\n",
    "\n",
    "def preprocess(data: List[dict]):\n",
    "    \"\"\"\n",
    "    Unite similar professions.\n",
    "\n",
    "    The \"raw_title\" of every entry is lower-cased and, when the lower-cased title is a key of\n",
    "    PROF2UNIFIED_PROF, replaced by the profession it maps to.\n",
    "\n",
    "    :param data: List[dictionary]\n",
    "    :return: none\n",
    "    changes the data dictionaries in place.\n",
    "    \"\"\"\n",
    "    for entry in data:\n",
    "        title = entry[\"raw_title\"].lower()\n",
    "        entry[\"raw_title\"] = PROF2UNIFIED_PROF.get(title, title)\n",
    "\n",
    "def pickle_data(data, name):\n",
    "    \"\"\"\n",
    "    Serialize `data` with pickle into the file \"<name>.pickle\".\n",
    "\n",
    "    :param data: the object to pickle\n",
    "    :param name: output path without the \".pickle\" extension\n",
    "    :return: none\n",
    "    \"\"\"\n",
    "    target = name + \".pickle\"\n",
    "    with open(target, \"wb\") as sink:\n",
    "        pickle.dump(data, sink)\n",
    "\n",
    "def write_to_file(dictionary, name):\n",
    "    \"\"\"\n",
    "    Write a mapping to the UTF-8 text file \"<name>.txt\", one \"key<TAB>value\" line per item.\n",
    "\n",
    "    :param dictionary: mapping to dump; lines follow the order of its sorted items\n",
    "    :param name: output path without the \".txt\" extension\n",
    "    :return: none\n",
    "    \"\"\"\n",
    "    with open(name + \".txt\", \"w\", encoding=\"utf-8\") as out:\n",
    "        out.writelines(\n",
    "            \"\\t\".join((str(key), str(value))) + \"\\n\"\n",
    "            for key, value in sorted(dictionary.items())\n",
    "        )\n",
    "\n",
    "def _split_group(group, test_size, dev_size, seed=0):\n",
    "    \"\"\"\n",
    "    Split the entries of one (profession, gender) group into train/dev/test lists.\n",
    "\n",
    "    :param group: list of entries belonging to the group\n",
    "    :param test_size: fraction of the group held out as test\n",
    "    :param dev_size: fraction of the remaining train+dev entries held out as dev\n",
    "    :param seed: random_state passed to both train_test_split calls\n",
    "    :return: (train, dev, test) lists\n",
    "    \"\"\"\n",
    "    # train_test_split raises ValueError when one side of a split would be empty; with the\n",
    "    # proportions used by split_train_dev_test that happens for groups of fewer than 3 entries,\n",
    "    # so such groups are kept entirely in train instead of aborting the whole run.\n",
    "    if len(group) < 3:\n",
    "        return list(group), [], []\n",
    "    train_dev, test = sklearn.model_selection.train_test_split(group, test_size=test_size, random_state=seed)\n",
    "    train, dev = sklearn.model_selection.train_test_split(train_dev, test_size=dev_size, random_state=seed)\n",
    "    return train, dev, test\n",
    "\n",
    "\n",
    "def split_train_dev_test(data: List[dict], output_dir: str, vocab_size: int):\n",
    "    \"\"\"\n",
    "    Build a train/dev/test split of the biographies, stratified by profession and gender,\n",
    "    and write it to disk.\n",
    "\n",
    "    Every (profession, gender) group is split 65% / 10% / 25% with random_state 0; the three\n",
    "    splits are then shuffled after seeding `random` and `np.random` with 0 (a global side\n",
    "    effect). Only entries whose lower-cased gender is \"m\" or \"f\" end up in the splits.\n",
    "    Written files: train/dev/test \".pickle\" and profession2index/word2index/gender2index \".txt\".\n",
    "\n",
    "    :param data: list of dictionaries, each containing the biography of a single person\n",
    "                 (keys read: \"raw_title\", \"gender\", \"raw\", \"start_pos\", \"bio\")\n",
    "    :param output_dir: prefix of every output file; file names are appended by plain string\n",
    "                       concatenation, so a directory must end with a path separator\n",
    "    :param vocab_size: how many words to keep\n",
    "    :return: none. writes the dataset to files\n",
    "    \"\"\"\n",
    "    g2i = {\"m\": 0, \"f\": 1}\n",
    "\n",
    "    # Count words incrementally rather than first collecting every token of the corpus in one list.\n",
    "    words_counter = Counter()\n",
    "    for d in data:\n",
    "        words_counter.update(d[\"raw\"].split(\" \"))\n",
    "    # most_common() returns [] for an empty corpus, so the vocabulary is then just <UNK>.\n",
    "    common_words = [\"<UNK>\"] + [word for word, _ in words_counter.most_common(vocab_size)]\n",
    "    w2i = {w: i for i, w in enumerate(sorted(common_words))}\n",
    "\n",
    "    nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "    # Bucket the processed entries by (profession, gender) in a single pass; each bucket keeps\n",
    "    # the entries in their order of appearance in `data`.\n",
    "    groups = {}\n",
    "    for entry in tqdm.tqdm(data, total=len(data)):\n",
    "        gender, prof = entry[\"gender\"].lower(), entry[\"raw_title\"].lower()\n",
    "        raw, start_index = entry[\"raw\"], entry[\"start_pos\"]\n",
    "        hard_text = raw[start_index + 1:] # the biography without the first line\n",
    "        # only the token texts are needed, so run the tokenizer alone instead of the pipeline\n",
    "        hard_text_tokenized = \" \".join(tok.text for tok in nlp.make_doc(hard_text))\n",
    "        text_without_gender = entry[\"bio\"] # the text, with all gendered words and names removed\n",
    "        groups.setdefault((prof, gender), []).append({\n",
    "            \"g\": gender,\n",
    "            \"p\": prof,\n",
    "            \"text\": raw,\n",
    "            \"start\": start_index,\n",
    "            \"hard_text\": hard_text,\n",
    "            \"text_without_gender\": text_without_gender,\n",
    "            \"hard_text_tokenized\": hard_text_tokenized,\n",
    "        })\n",
    "\n",
    "    # Professions are indexed by the same lower-cased title that is stored in each entry, so\n",
    "    # titles that were not lower-cased beforehand are no longer silently left out of the splits.\n",
    "    all_profs = sorted({prof for prof, _ in groups})\n",
    "    p2i = {p: i for i, p in enumerate(all_profs)}\n",
    "\n",
    "    train_prop, dev_prop, test_prop = 0.65, 0.1, 0.25\n",
    "    # dev is carved out of the train+dev remainder left after removing test: 0.1 / 0.75\n",
    "    dev_size = dev_prop / (train_prop + dev_prop)\n",
    "\n",
    "    train, dev, test = [], [], []\n",
    "    for prof in all_profs:\n",
    "        prof_m_train, prof_m_dev, prof_m_test = _split_group(groups.get((prof, \"m\"), []), test_prop, dev_size)\n",
    "        prof_f_train, prof_f_dev, prof_f_test = _split_group(groups.get((prof, \"f\"), []), test_prop, dev_size)\n",
    "\n",
    "        train.extend(prof_m_train + prof_f_train)\n",
    "        dev.extend(prof_m_dev + prof_f_dev)\n",
    "        test.extend(prof_m_test + prof_f_test)\n",
    "\n",
    "    # fixed seeds make the shuffled order reproducible\n",
    "    np.random.seed(0)\n",
    "    random.seed(0)\n",
    "\n",
    "    random.shuffle(train)\n",
    "    random.shuffle(dev)\n",
    "    random.shuffle(test)\n",
    "\n",
    "    pickle_data(train, output_dir + \"train\")\n",
    "    pickle_data(dev, output_dir + \"dev\")\n",
    "    pickle_data(test, output_dir + \"test\")\n",
    "    write_to_file(p2i, output_dir + \"profession2index\")\n",
    "    write_to_file(w2i, output_dir + \"word2index\")\n",
    "    write_to_file(g2i, output_dir + \"gender2index\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output prefix: split_train_dev_test appends file names to it verbatim, hence the trailing separator.\n",
    "output_dir = \"D:\\\\Datasets\\\\biosbias-master\\\\\"\n",
    "# The pickled BIOS dataset sits in the same folder.\n",
    "input_path = output_dir + \"BIOS.pkl\"\n",
    "# Number of most frequent words kept in the vocabulary.\n",
    "vocab_size = 250000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: load_data unpickles the file, which can execute arbitrary code; only point input_path at a trusted file.\n",
    "data = load_data(input_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Modifies the entries of `data` in place: lower-cases each \"raw_title\" and unites similar professions.\n",
    "preprocess(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "100%|██████████| 396189/396189 [02:15<00:00, 2934.12it/s]\n"
     ]
    }
   ],
   "source": [
    "# Writes the train/dev/test pickles and the profession/word/gender index files, each path prefixed with output_dir.\n",
    "split_train_dev_test(data = data, output_dir = output_dir, vocab_size = vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}